#!/usr/bin/env python

import argparse
import re
import sys
import os

parser = argparse.ArgumentParser(description="This script takes a gff file and converts it into the format required\
                                              for importing external gene calls and functions into anvi'o. It was\
                                              made based on the gff3 format. Currently, it does not take into account 'call_type',\
                                              and assumes all are for coding sequences. For version info, run `bit-version`.")

required = parser.add_argument_group('required arguments')

required.add_argument("-i", "--input_gff", help='input gff file (e.g. "*.gff")', action="store", dest="input_gff", required=True)
parser.add_argument("-s", "--annotation_source", help='Annotation source (default: "NCBI_PGAP")', action="store", dest="source", default="NCBI_PGAP")
parser.add_argument("-v", "--annotation_version", help='Annotation source version (default: "v4.6")', action="store", dest="version", default="v4.6")
parser.add_argument("-o", "--output_gene_calls_tsv", help='Output tsv file (default: "external_gene_calls.tsv")', action="store", dest="output_gene_calls_tsv", default="external_gene_calls.tsv")
parser.add_argument("-a", "--output_functions_tsv", help='Output functions file (default: "functions.tsv")', action="store", dest="output_functions_tsv", default="functions.tsv")
parser.add_argument("-m", "--output_mappings_tsv", help='Output mapping file for anvio "gene_callers_id"s to protein accessions and locus tags if present (default: "mappings.tsv")', action="store", dest="output_mappings_tsv", default="mappings.tsv")

if len(sys.argv)==1:
    parser.print_help(sys.stderr)
    sys.exit(0)

args = parser.parse_args()

source = args.source
version = args.version

output_gene_calls = open(args.output_gene_calls_tsv, "w")
output_gene_calls.write("gene_callers_id" + "\t" + "contig" + "\t" + "start" + "\t" + "stop" + "\t" + "direction" + "\t" + "partial" + "\t" "call_type" + "\t" + "source" + "\t" + "version" + "\n")

output_functions = open(args.output_functions_tsv, "w")
output_functions.write("gene_callers_id" + "\t" + "source" + "\t" + "accession" + "\t" + "function" + "\t" + "e_value" + "\n")

output_mappings = open(args.output_mappings_tsv, "w")
output_mappings.write("gene_callers_id" +"\t" + "protein_acc" + "\n")

num = 0 # iterator for assigning "gene_callers_id"
max_stop = sys.maxsize # intializing variable to ensure we don't keep gene calls that extend beyond the contig (for those that have a "region" specified)

with open(args.input_gff, "r") as inputs:
    for line in inputs:
        line = line.strip()
        # resetting max_stop position if we passed comment lines (this means we moved onto a new gff file that was catted together)
        # some gff files have "region" lines stating the length of the contig that holds the following genes, others do not, this is so
        # this value is reset on a new gff file to the max, in case this file doesn't have that
        if line.startswith("#"):
            max_stop = sys.maxsize
            continue

        else:
            line_split = line.split("\t")
            
            # setting contig length value, this will enable us to filter out those that extend beyond the end of the contig, if "region" lines are present in the gff file
            if line_split[2] == "region":
                max_stop = line_split[4]
                continue

            if line_split[2] == "CDS":

                if int(line_split[4]) > int(max_stop):
                    continue

                contig = line_split[0].split(" ")[0]
                start = int(line_split[3]) - 1
                stop = line_split[4]

                # setting strand to "f" or "r":
                if line_split[6] == "+":
                    strand="f"
                else:
                    strand="r"

                info = line_split[8].split(";")

                curr_dict = {}

                for item in info:

                    if "=" in item and int(len(item.split("=")) > 1):

                        key = item.split("=")[0]
                        value = item.split("=")[1]

                        curr_dict[key] = value

                # skipping if marked as pseudogene
                if "pseudo" in curr_dict:
                    if curr_dict["pseudo"] == "true":
                        continue

                # skipping if marked as partial
                if "partial" in curr_dict:
                    if curr_dict["partial"] == "true" or curr_dict["partial"] == "01":
                        continue

                if "protein_id" in curr_dict:
                    accession = curr_dict["protein_id"]
                else:
                    accession = "Not_Identified_in_gff_file"

                if "locus_tag" in curr_dict:
                    locus_tag = curr_dict["locus_tag"]



                if "product" in curr_dict:
                    product = curr_dict["product"]
                    product = re.sub('%2C', ',', product) # for some reason the commas in here are "%2C", so changing
                    product = re.sub('%3B', ';', product) # these are dumb too
                    ## some of these have different spaces due to odd characters, fixing to match what i get from my genbank-to-anvio parser
                    product = re.sub(r'([0-9]), ([0-9])' , r'\1,\2', product) 
                    product = re.sub(r"([0-9]),\r([0-9])" , r'\1,\2', product) 
                    product = re.sub(r'([A-Za-z0-9])- ', r'\1-', product)
                    product = re.sub(r' -([A-Za-z0-9])', r'-\1', product)
                    product = product.replace('( ', '(')
                    product = product.replace(' )', ')')


                else:
                    product = "hypothetical protein"

                if "gene" in curr_dict and product != "hypothetical protein":
                    gene_name = curr_dict["gene"]
                    product = str(product) + " (" + gene_name + ")"


                num += 1

                output_gene_calls.write(str(num) + "\t" + str(contig) + "\t" + str(start) + "\t" + str(stop) + "\t" + str(strand) + "\t" + "0" + "\t" + "1" + "\t" + str(source) + "\t" +  str(version) + "\n")
                output_mappings.write(str(num) + "\t" + str(accession) + "\n")

                if "hypothetical protein" in product:
                    continue
                else:
                    output_functions.write(str(num) + "\t" + str(source) + "\t" + str(accession) + "\t" + str(product) + "\t" + "0" + "\n")

output_gene_calls.close()
output_functions.close()
